This project combines the following main steps:
### GET S&P 500 constituents from Wikipedia ###
def get_sp500_cons():
    '''
    Extract S&P 500 companies from Wikipedia.

    Scrapes the constituents table, normalizes tickers to Yahoo Finance
    format (e.g. 'BRK.B' -> 'BRK-B'), and keeps only the Symbol, Security,
    GICS Sector and GICS Sub-Industry columns.

    Returns
    -------
    pd.DataFrame indexed by ticker Symbol with columns
    ['Security', 'GICS Sector', 'GICS Sub-Industry'],
    sorted by sector then sub-industry, NaN rows dropped.
    '''
    URL = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
    df = pd.read_html(URL)[0]
    # Yahoo Finance uses '-' for share classes. regex=False makes the '.'
    # a literal character and silences the pandas FutureWarning about
    # single-character patterns being treated as regular expressions.
    df['Symbol'] = df['Symbol'].str.replace('.', '-', regex=False)
    df = df.drop(['Headquarters Location', 'Date added', 'CIK', 'Founded'], axis=1)
    df = df.sort_values(by=['GICS Sector', 'GICS Sub-Industry'])
    df = df.set_index('Symbol')
    df.dropna(inplace=True)
    return df
# Ticker scraping: pull the current S&P 500 constituent table
data = get_sp500_cons()
# Preview the first rows (sorted by GICS sector / sub-industry)
data.head()
/var/folders/sr/2_q1kwss0cg7rt_ny655psg80000gn/T/ipykernel_55391/3925123783.py:9: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.
| Security | GICS Sector | GICS Sub-Industry | |
|---|---|---|---|
| Symbol | |||
| IPG | Interpublic Group of Companies (The) | Communication Services | Advertising |
| OMC | Omnicom Group | Communication Services | Advertising |
| WBD | Warner Bros. Discovery | Communication Services | Broadcasting |
| CHTR | Charter Communications | Communication Services | Cable & Satellite |
| CMCSA | Comcast | Communication Services | Cable & Satellite |
### GET prices from yfinance ###
def get_prices(tickers, start='1995-12-31'):
    '''
    Download daily adjusted-close prices from yfinance for a list of tickers.

    Parameters
    ----------
    tickers : list of ticker symbols understood by Yahoo Finance.
    start : str, first date of the download window ('YYYY-MM-DD').

    Returns
    -------
    pd.DataFrame of adjusted-close prices at calendar-day frequency,
    with non-trading days forward-filled from the last trading day.
    '''
    # auto_adjust=False keeps the 'Adj Close' column: recent yfinance
    # versions default to auto_adjust=True, which drops that column and
    # would make the selection below raise a KeyError.
    prices = yf.download(tickers, start=start, interval='1d', auto_adjust=False)
    prices = prices['Adj Close']
    # Reindex to every calendar day and forward-fill so weekends and
    # holidays carry the previous close.
    prices = prices.asfreq('D').ffill()
    return prices
# Collect adjusted-close prices from yfinance into the stock_data df
ticker_list = data.index.to_list()
# Download window starts at the beginning of 2015
start_date = '2015-01-01'
stock_data = get_prices(tickers=ticker_list,start=start_date)
[*********************100%***********************] 503 of 503 completed
# Preview the downloaded price panel (one column per ticker)
stock_data.head()
| A | AAL | AAP | AAPL | ABBV | ABC | ABT | ACGL | ACN | ADBE | ... | WYNN | XEL | XOM | XRAY | XYL | YUM | ZBH | ZBRA | ZION | ZTS | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Date | |||||||||||||||||||||
| 2014-12-31 | 38.084980 | 50.814617 | 147.124588 | 24.767366 | 45.798294 | 78.874519 | 38.343540 | 19.700001 | 77.527664 | 72.699997 | ... | 130.989746 | 27.941639 | 62.913483 | 49.859379 | 34.242157 | 44.820740 | 103.148308 | 77.410004 | 23.585791 | 40.600731 |
| 2015-01-01 | 38.084980 | 50.814617 | 147.124588 | 24.767366 | 45.798294 | 78.874519 | 38.343540 | 19.700001 | 77.527664 | 72.699997 | ... | 130.989746 | 27.941639 | 62.913483 | 49.859379 | 34.242157 | 44.820740 | 103.148308 | 77.410004 | 23.585791 | 40.600731 |
| 2015-01-02 | 37.823868 | 51.079914 | 146.459610 | 24.531767 | 46.113235 | 79.136963 | 38.241341 | 19.496668 | 77.119659 | 72.339996 | ... | 129.343063 | 28.097219 | 63.172077 | 48.605164 | 34.251148 | 44.513096 | 102.393463 | 77.430000 | 23.403791 | 40.864918 |
| 2015-01-03 | 37.823868 | 51.079914 | 146.459610 | 24.531767 | 46.113235 | 79.136963 | 38.241341 | 19.496668 | 77.119659 | 72.339996 | ... | 129.343063 | 28.097219 | 63.172077 | 48.605164 | 34.251148 | 44.513096 | 102.393463 | 77.430000 | 23.403791 | 40.864918 |
| 2015-01-04 | 37.823868 | 51.079914 | 146.459610 | 24.531767 | 46.113235 | 79.136963 | 38.241341 | 19.496668 | 77.119659 | 72.339996 | ... | 129.343063 | 28.097219 | 63.172077 | 48.605164 | 34.251148 | 44.513096 | 102.393463 | 77.430000 | 23.403791 | 40.864918 |
5 rows × 503 columns
### COMPUTE Returns on specified frequency ###
def get_returns(prices_df, freq='D'):
    '''
    Convert a daily price DataFrame into percentage returns at the
    requested frequency: 'D' (daily), 'B' (business), 'W' (weekly)
    or 'M' (monthly). Returns a DataFrame of returns.
    '''
    # Remove all-NaN dates, then any column that still has gaps.
    clean = prices_df.dropna(axis=0, how='all').dropna(axis=1)
    # Keep the last observation of each resampling bucket; drop columns
    # that end up with missing values at the new frequency.
    resampled = clean.resample(freq).last().dropna(axis=1)
    # Period-over-period change; the first (NaN) row is discarded.
    return resampled.pct_change().dropna()
# Set frequency of returns ('W' = weekly)
freq = 'W'
# Compute returns at the chosen frequency from the daily price panel
returns = get_returns(stock_data,freq=freq)
returns.head(3)
| A | AAL | AAP | AAPL | ABBV | ABC | ABT | ACGL | ACN | ADBE | ... | WYNN | XEL | XOM | XRAY | XYL | YUM | ZBH | ZBRA | ZION | ZTS | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Date | |||||||||||||||||||||
| 2015-01-11 | 0.000739 | -0.035059 | 0.010974 | 0.024513 | -0.001669 | 0.028079 | 0.006682 | 0.010600 | 0.010581 | -0.006912 | ... | 0.014705 | 0.001661 | -0.007864 | 0.015983 | -0.071166 | 0.015342 | 0.049916 | 0.040165 | -0.078827 | 0.021704 |
| 2015-01-18 | -0.057650 | -0.042484 | -0.064255 | -0.053745 | -0.011485 | -0.006237 | -0.010498 | 0.002368 | -0.009913 | -0.001531 | ... | -0.013821 | 0.023217 | -0.010641 | -0.037908 | -0.025446 | -0.008422 | -0.010236 | 0.029426 | -0.049117 | -0.000678 |
| 2015-01-25 | 0.014640 | 0.118048 | 0.038533 | 0.065950 | -0.032693 | 0.024995 | -0.014161 | 0.002700 | 0.003712 | 0.032483 | ... | -0.006667 | 0.019989 | -0.002524 | 0.002561 | 0.020308 | 0.023195 | -0.000171 | 0.015438 | 0.001210 | -0.000385 |
3 rows × 479 columns
### SAMPLE N stocks => returns_df ###
def get_sample(returns, N=None):
    '''
    Sample N stocks randomly (without replacement) from the columns of a
    returns DataFrame.

    Parameters
    ----------
    returns : pd.DataFrame of returns, one column per stock.
    N : int, optional. Number of stocks to draw; defaults to all columns.
        (The previous default evaluated `returns.shape[1]` against a
        global DataFrame at definition time, so it was wrong for any
        other input.)

    Returns
    -------
    pd.DataFrame restricted to the N sampled columns.
    '''
    # Resolve the default at call time so it tracks the DataFrame passed in.
    if N is None:
        N = returns.shape[1]
    sample = random.sample(returns.columns.to_list(), N)
    return returns[sample]
# Randomly sample N=350 of the available stocks for the analysis
returns_df = get_sample(returns,N=350)
returns_df.head(3)
| UAL | UNP | APD | VZ | TXT | NSC | BRO | GOOG | JBHT | AEE | ... | PAYC | PNR | APA | VRSK | BG | DOV | ETR | FSLR | DVN | AVGO | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Date | |||||||||||||||||||||
| 2015-01-11 | -0.015074 | -0.030605 | -0.014234 | 0.007521 | 0.009722 | -0.054787 | -0.002764 | -0.054572 | -0.028959 | -0.021527 | ... | -0.024175 | -0.023396 | -0.049193 | -0.013756 | -0.018649 | -0.032509 | 0.000913 | -0.007856 | -0.009022 | 0.048056 |
| 2015-01-18 | 0.006275 | -0.026787 | -0.025460 | 0.026305 | 0.006811 | 0.003974 | -0.019711 | 0.024004 | -0.012251 | 0.034222 | ... | -0.076288 | -0.009828 | 0.025704 | -0.003012 | 0.027946 | -0.001005 | 0.010024 | -0.078733 | 0.004800 | -0.010295 |
| 2015-01-25 | 0.111787 | 0.073191 | 0.026841 | -0.017504 | -0.011663 | 0.011585 | -0.002513 | 0.062726 | 0.040090 | -0.004512 | ... | 0.151128 | -0.003567 | 0.014194 | 0.035135 | -0.000653 | 0.002731 | 0.010488 | 0.044695 | -0.008896 | 0.030437 |
3 rows × 350 columns
### RUN PCA ###
def train_PCA(returns, n_comp=20):
    """
    Fit a PCA with n_comp components on standardized returns.

    Returns
    -------
    W : DataFrame of loadings, one row per stock, one column per component.
    pca : the fitted sklearn PCA object.
    X_proj : DataFrame of the standardized returns projected on the components.
    cum_var : array of cumulative explained-variance ratios.
    """
    # Standardize each stock's return series (zero mean, unit variance).
    scaler = StandardScaler().fit(returns)
    scaled = pd.DataFrame(scaler.transform(returns),
                          columns=returns.columns,
                          index=returns.index)
    # Fit the decomposition on the standardized data.
    pca = PCA(n_comp).fit(scaled)
    comp_names = pca.get_feature_names_out()
    # Loadings matrix: components_ is (n_comp x stocks), so transpose.
    W = pd.DataFrame(pca.components_.T,
                     index=returns.columns,
                     columns=comp_names)
    # Cumulative share of variance explained by the first i components.
    cum_var = np.cumsum(pca.explained_variance_ratio_)
    print(f'Total explained variance:{np.round(cum_var.max(),2)} with {n_comp} PCs')
    # Coordinates of each date's return vector in component space.
    X_proj = pd.DataFrame(pca.transform(scaled),
                          columns=comp_names,
                          index=scaled.index)
    return W, pca, X_proj, cum_var
# Run PCA on the sampled returns df (20 components)
W,pca,X_proj,cum_var = train_PCA(returns_df,n_comp=20)
# Plot total cumulative explained variance using n_comp PCs
pyo.init_notebook_mode(connected=True)
import plotly.express as px
fig = px.bar(cum_var,
title='Cumulative Variance explained by PCs',
)
fig.update_layout(xaxis_title='PCs', yaxis_title='Cum Var',showlegend=False,)
fig.show()
Total explained variance:0.66 with 20 PCs
# Quick visual check of PC orthogonality: off-diagonal correlations
# between principal components should be ~0 by construction.
plt.figure(figsize=(12,8))
sns.heatmap(X_proj.corr(), cmap='coolwarm')
plt.title('Correlation Heatmap - Principal Components')  # typo fixed ('Princpal')
plt.show()
# Scatter the first two PCA loadings per stock, colored by GICS sector
# ('pca0'/'pca1' are the column names from pca.get_feature_names_out())
df = W.join(data[['Security','GICS Sector']])
fig = px.scatter(df, x='pca0', y='pca1',
title='PCA - PC1 vs PC2',
color='GICS Sector',
hover_data=[df.index,df.Security],
)
fig.show()
# 3-D scatter of the top 3 PCA loadings, colored by GICS sector
fig = px.scatter_3d(df, x='pca0', y='pca1', z='pca2',
title='PCA - Top 3 PCs',
color='GICS Sector',
hover_data=[df.index,df.Security],
)
fig.show()
### GET K-Means clusters labels based on PCA ###
def get_pcakmean_clusters(W, k=11, random_state=None):
    """
    Cluster stocks into k groups with K-Means on their PCA loadings.

    Parameters
    ----------
    W : DataFrame of PCA loadings, one row per stock.
    k : int, number of clusters.
    random_state : int, optional. Seed for reproducible centroids;
        None (the default) keeps the previous non-deterministic behavior.

    Returns
    -------
    DataFrame with a single 'cluster' column indexed by stock,
    sorted by cluster label.
    """
    # n_init is pinned to 10 (the historical sklearn default) so results
    # do not shift — and no FutureWarning fires — when sklearn changes
    # its default to 'auto'.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=random_state)
    kmeans.fit(W)
    labels = kmeans.labels_
    # Assign each stock to its centroid label and group by cluster.
    clusters_k = pd.DataFrame(labels, index=W.index, columns=['cluster'])
    clusters_k = clusters_k.sort_values(by='cluster')
    return clusters_k
# Set number of clusters
k = 11 # in-line with GICS Sector classification
# Cluster stocks with K-Means on their PCA loadings
clusters_pcak = get_pcakmean_clusters(W,k=k)
### RUN HCA ###
def get_hca_clusters(X, k=None):
    """
    Hierarchically cluster the stocks of a returns DataFrame.

    Builds the covariance matrix of X, computes a Ward linkage on it,
    and cuts the dendrogram into k flat clusters.

    Parameters
    ----------
    X : returns DataFrame, one column per stock.
    k : int, optional. Number of clusters; defaults to one tenth of the
        number of stocks (at least 1). The previous default evaluated a
        global DataFrame at definition time and produced a float.

    Returns
    -------
    clusters_h : DataFrame with a 'cluster' column indexed by stock,
        sorted by cluster label.
    Z : scipy linkage matrix (useful for plotting the dendrogram).
    """
    # Resolve the default at call time from the actual input.
    if k is None:
        k = max(1, X.shape[1] // 10)
    # Build Covariance Matrix C: each row is one stock's covariance profile.
    C = X.cov()
    # Ward's method minimizes within-cluster variance at each merge.
    Z = linkage(C, method='ward', metric='euclidean')
    # Cut the dendrogram into at most k flat clusters.
    labels = fcluster(Z, k, criterion='maxclust')
    # Assign cluster labels to stocks.
    clusters_h = pd.DataFrame(labels, index=X.columns, columns=['cluster'])
    clusters_h = clusters_h.sort_values(by='cluster')
    return clusters_h, Z
# Set number of clusters
k = 11 # in-line with GICS Sector classification
# Get clusters and linkage matrix Z from hierarchical clustering
clusters_h, Z = get_hca_clusters(returns_df,k=k)
# Plot dendrogram of the Ward linkage over the sampled stocks
plt.figure(figsize=(16,8))
dendrogram(Z)
plt.title(f'Dendrogram of {returns_df.shape[1]} sampled stocks')
plt.xlabel('Stocks')
plt.ylabel('Distance')
plt.show()
# Plot Correlation Heatmaps
# NOTE(review): this re-displays the previous plotly figure; the code that
# actually builds correlation heatmaps is not present here — TODO confirm
# whether a cell is missing from this export.
fig.show()
# PCA-K-Means clustering details
clusters = clusters_pcak
df = clusters.join(data)
# Mean return per K-Means cluster (transpose so clusters become columns)
# NOTE(review): .mean() over a frame with string columns needs
# numeric_only=True on pandas >= 2.0 — confirm pandas version.
KMEAN_ret = returns_df.T.join(df).groupby('cluster').mean().T
# Count securities per (cluster, sector) pair for the stacked bar chart
grouped = df.groupby(['cluster', 'GICS Sector', 'Security']).size().reset_index(name='count')
fig = px.bar(grouped, x='cluster', y='count', color='GICS Sector', title=f'PCA & K-Means clustering',
barmode='stack',hover_data=['Security'],text='Security')
fig.update_layout(xaxis_title='Cluster', yaxis_title='Security count', hovermode='closest',width=800)
fig.show()
# HCA Clustering details
clusters = clusters_h
df = clusters.join(data)
# Mean return per hierarchical cluster (transpose so clusters become columns)
# NOTE(review): .mean() over a frame with string columns needs
# numeric_only=True on pandas >= 2.0 — confirm pandas version.
HCA_ret = returns_df.T.join(df).groupby('cluster').mean().T
# Count securities per (cluster, sector) pair for the stacked bar chart
grouped = df.groupby(['cluster', 'GICS Sector', 'Security']).size().reset_index(name='count')
fig = px.bar(grouped, x='cluster', y='count', color='GICS Sector', title=f'Hierarchical clustering',
barmode='stack',hover_data=['Security'],text='Security')
fig.update_layout(xaxis_title='Cluster', yaxis_title='Security count', hovermode='closest',width=800)
fig.show()
# GICS Clustering details (benchmark: the official sector labels)
clusters = returns_df.T
df = clusters.join(data)
# Mean return per GICS sector
# NOTE(review): .mean() over string columns ('Security', sub-industry) needs
# numeric_only=True on pandas >= 2.0 — confirm pandas version.
GICS_ret = df.groupby('GICS Sector').mean().T
# Count securities per sector for the stacked bar chart
grouped = df.groupby(['GICS Sector', 'Security']).size().reset_index(name='count')
fig = px.bar(grouped, x='GICS Sector', y='count', color='GICS Sector', title=f'GICS Sectors clustering',
barmode='stack',hover_data=['Security'],text='Security')
fig.update_layout(xaxis_title='Cluster', yaxis_title='Security count', hovermode='closest',width=800)
fig.show()
# Computing and comparing within-cluster vs. between-cluster correlations
# NOTE(review): corr_comp_df is not defined anywhere in this view —
# presumably built in a cell not shown here; verify before running
# the notebook top-to-bottom.
corr_comp_df
| PCA-KMEAN | HCA | GICS | |
|---|---|---|---|
| Within Corr | 0.491019 | 0.504967 | 0.494369 |
| Between Corr | 0.681807 | 0.608858 | 0.693349 |
About me